TODO

  • Site summary data

  • Biweekly summary of variables

    • Figures plus events timeline
    • Correlation & Heatmap
    • Add case counts / fix figure
  • Case date imputation — document the imputation rule used and review its limitations.

  • Model for cases ~ .

  • Test high vs low traffic as FE in RE model

  • Finalize fig. 2

  • Finalize fig. 3

  • Finish written methods and results


Analysis code
# setup ---------------------------------------------
library(tidyverse)
library(here)
library(ggiraph)
library(patchwork)
library(ggtext)
library(janitor)

ggplot2::theme_set(theme_bw() + theme(
  legend.text = element_markdown(family = 'IBM Plex Sans', colour = 'gray30'),
  axis.text = element_markdown(family = 'IBM Plex Sans', colour = 'gray30'),
  axis.title = element_markdown(family = 'IBM Plex Sans',colour = 'gray30'),
))
hrbrthemes::import_plex_sans()

source(here('R/report_plots.R'))

# Functions -----------------------------------------

# Convert a qPCR quantification cycle (Cq) value to an estimated copy number
# via the assay standard curve (intercept 40.1, slope 3.23 cycles per log10).
# Vectorized over `cq`; NA inputs yield NA. Uses `^` rather than the
# undocumented `**` alias for exponentiation.
convert_cq_to_copies <- function(cq) 10^((40.1 - cq) / 3.23)

# creates yyyy-ww label for grouping data
# Build a "yyyy-ww" label (week-of-year zero-padded to two digits) for
# grouping observations by calendar week. Vectorized over dates.
get_date_week <- function(x){
  y <- lubridate::year(x)
  # pad with the character '0' (not numeric 0) for consistency with
  # week_midpoint_date_lookup(), which builds the same label format
  w <- lubridate::week(x) |> str_pad(2, 'left', '0')
  str_glue("{y}-{w}")
}

# get biweekly bin date for vector of dates
# Assign each date to a half-week ("biweekly" = twice-weekly) bin:
# Sunday-Tuesday collapse onto that week's Tuesday, Wednesday-Saturday onto
# Thursday (lubridate::wday(): 1 = Sunday ... 7 = Saturday).
# The trailing comma after the last case_when() formula has been removed;
# it is fragile across dplyr versions.
get_date_biweekly <- function(x){
  day_of_week <- lubridate::wday(x)
  case_when(
    day_of_week %in% 1:3 ~ x + 3 - day_of_week,
    TRUE ~ x + 5 - day_of_week
  )
}

# dates 2021-2022 with biweekly groups
# Every date in 2021-2022 paired with its half-week bin date.
biweekly_date_sequence <- function(){
  all_days <- seq(as_date('2021-01-01'), as_date('2022-12-31'), by = 1)
  tibble(date = all_days) |>
    mutate(biweek = get_date_biweekly(date))
}

# lookup table for date
# Lookup table mapping each "yyyy-ww" week label to the mean (midpoint) date
# of that week, so weekly aggregates can be plotted on a continuous date axis.
#
# @param start,end Inclusive bounds of the date sequence (character or Date).
# @return A tibble with columns `week` (glue label) and `date` (week midpoint).
week_midpoint_date_lookup <- function(start = '2021-01-01', end = '2023-01-01'){
  ends <- lubridate::as_date(c(start, end))  # `<-`, not `=`, for assignment
  tibble(
    date = seq(ends[1], ends[2], by = 1),
    date_week = lubridate::week(date) |> str_pad(2, 'left', '0'),
    date_year = lubridate::year(date),
    week = str_glue("{date_year}-{date_week}"),
  ) |> 
    group_by(week) |> 
    summarise(date = mean(date))
}

# Wilson 95% binomial confidence interval for x successes out of n trials,
# returned as a one-row tibble with snake_case columns
# (point_est, lower, upper).
binom_ci <- function(x, n){
  ci <- Hmisc::binconf(x, n, method = 'wilson', alpha = 0.05)
  ci |>
    as_tibble() |>
    janitor::clean_names()
}

# Summarise a (possibly grouped) swab data frame into PCR positivity with a
# Wilson 95% CI, expressed as percentages rounded to one decimal place.
get_binom_ci <- function(data){
  counts <- data |>
    summarise(
      x = sum(pcr_positive == 'Positive'),
      n = n(),
      binconf = map2(x, n, binom_ci)
    )
  counts |>
    unnest(binconf) |>
    mutate(across(c(point_est, lower, upper), \(v) round(v * 100, 1)))
}

# Plot Elements ---------------------------------------

theme_color <- 'cornflowerblue'

# layout x-axis
# layout x-axis
# Shared x-axis for all study time-series panels: monthly breaks labelled
# "Sep 2021" ... "May 2022", with limits taken from the swab data.
# NOTE(review): relies on the global `swabs_tidy` existing in the calling
# environment, and the label vector is hard-coded to exactly 9 monthly
# breaks — confirm before changing the study window.
scale_x_study_dates <- function(){
    scale_x_date(
      date_breaks = 'month',
      date_labels = month.name[c(9:12, 1:5)] |> strtrim(3) |> 
        paste0(' ', c(rep(2021, 4), rep(2022, 5))),
      limits = c(
        min(swabs_tidy$date_swab), 
        max(swabs_tidy$date_swab)
      ))
}
# get rid of x-axis for vertically stacked plots
# Strip all x-axis labelling (ticks, title, text) so a panel can sit
# directly above another panel that carries the shared x-axis.
theme_no_x_labels <- function(){
  blank <- element_blank()
  theme(
    axis.ticks.x = blank,
    axis.title.x = blank,
    axis.text.x = blank
  )
}
# get rid of y-axis for horizontally stacked plots
# Strip all y-axis labelling (ticks, title, text) so a panel can sit
# directly beside another panel that carries the shared y-axis.
theme_no_y_labels <- function(){
  blank <- element_blank()
  theme(
    axis.ticks.y = blank,
    axis.title.y = blank,
    axis.text.y = blank
  )
}
# no minor grid and adjust spacing for multipanel
# no minor grid and adjust spacing for multipanel
# Compact multipanel theme: drops the y-axis title (the panel title serves
# as the variable label), removes the minor grid, and zeroes outer margins
# so vertically stacked panels sit flush against each other.
theme_remove_grid <- function(){
  theme(
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title.position = 'panel', 
    # small bold gray markdown title drawn inside the panel area
    plot.title = element_markdown(
      vjust = -1, 
      size = 8, 
      face = 'bold', 
      family = 'IBM Plex Sans',
      colour = 'gray50',
      margin = margin(0, 0, 0, 0)),
    plot.margin = unit(c(0, 0, 0, 0), 'mm'))
}

# Data ------------------------------------------------

# swab data
# swab data
# Full surface-swab dataset, restricted to University of Ottawa sites
# (site codes prefixed "UO_").
swabs <- 
  read_rds(here('data/cube.rds')) |> 
  filter(str_detect(site, '^UO_')) 

# filtered swabs without controls or sponge samples
# Analysis set: drop negative controls and sponge-type samples, keep swabs
# collected before 2022-04-27 (the Date method coerces the string for the
# comparison), and attach the half-week bin used throughout the analysis.
swabs_tidy <- swabs |> 
  filter(!negative_control, 
         swab_type != 'sponge',
         date_swab < '2022-04-27') |> 
  select(date_swab, site, floor, location, pcr_positive, pcr_ct, co2) |> 
  mutate(biweek = get_date_biweekly(date_swab))

# lookup table for waste water site names
lookup_ww <- tribble(
  ~site_abbr, ~site,
  'UO_AA',  'Annex Residence',
  'UO_FA',  'Faculty of Social Sciences',
  'UO_FT',  'Friel Residence',
  'UO_NA',  'Northern sampling site',
  'UO_RGN', 'Roger Guindon Hall',
  'UO_ST',  'Southern sampling site',
  'UO_TBT', 'Tabaret Hall'
)

# UOttawa waste water data from R. Delatolla
uottawa_ww <-
  readxl::read_excel(here::here('data/ww_university.xlsx')) |>
  janitor::clean_names() |> 
  mutate(sample_date = as_date(sample_date)) |> 
  filter(
    sample_date > '2021-09-15', 
    sample_date < '2022-05-05',
    !is.na(viral_copies_per_copies_pep_avg),
    site != 'UO_RGN'
    ) |> 
  left_join(lookup_ww, by = c('site' = 'site_abbr')) |> 
  mutate(
    source =
      source_name |>
      str_remove('^Data - uOttawa - ') |>
      str_remove('.xlsx$')
  ) |> 
  transmute(site,
            sample_date, 
            signal = viral_copies_per_copies_pep_avg)

# ottawa wastewater data: daily means
regional_ww <- 
  read_rds(here('data/ww_ottawa.rds')) |> 
  select(sample_date, starts_with('cov_')) |> 
  pivot_longer(contains('cov_')) |> 
  mutate(target = str_extract(name, 'cov_n.'),
         stat = str_extract(name, 'mean|sd'),
         ) |> 
  select(-name) |> 
  pivot_wider(names_from = stat, values_from = value) |> 
  mutate(.lower = mean - sd, .upper = mean + sd)   

# data from university of ottawa
wifi <- 
  read_rds(here('data/uo_wifi.rds')) |> 
  filter(date >= min(swabs$date_swab), 
         date <= max(swabs$date_swab))

# important dates for context
event_dates <- 
  tribble(
    ~event, ~start, ~end,
    'Reading Week',  '2021-10-24', '2021-10-30',
    'Holiday Break', '2021-12-22', '2022-01-04',
    'Closure',       '2022-01-04', '2022-01-31',
    'Closure',       '2022-02-16', '2022-02-21',
    'Reading Week',  '2022-02-20', '2022-02-26',
  ) |> 
  mutate(across(start:end, as_date))

# UOttawa cases data
cases <-
  read_rds(here('data/cases_rule_imputed.rds')) |> 
  select(case, role, case_date, UC:TBT) |>
  mutate(biweek = get_date_biweekly(case_date)) |> 
  nest(associated_sites = UC:TBT)

# Summaries -----------------------------------------
# 
# Count swabs (n), PCR-positive swabs (x), and percent positivity for the
# current grouping of `swabs`; `.groups = 'drop'` returns an ungrouped result.
# NOTE(review): na.rm = FALSE means any NA in pcr_positive propagates into
# `x` and `positivity` — confirm pcr_positive is never missing upstream.
positivity_summary <- function(swabs){
  swabs |> 
    summarise(
      n = n(),
      # spell out FALSE; never the reassignable F alias
      x = sum(pcr_positive == 'Positive', na.rm = FALSE),
      positivity = round(100 * x / n, digits = 1),
      .groups = 'drop')
}

# Per-group CO2 summary: sample size, the raw CO2 values kept as a list
# column for downstream use, and the mean ignoring missing readings.
co2_summary <- function(swabs){
  swabs |> 
    summarise(
      n = n(),
      co2_vals = list(co2),
      co2_mean = mean(co2, na.rm = TRUE),  # TRUE, not the T alias
      .groups = 'drop')
}
  
# overall summary
swab_summary <- 
  swabs_tidy |> positivity_summary()

# site summary
swab_summary_sites <- swabs_tidy |> 
  group_by(site) |> 
  positivity_summary()
  
# high-low traffic summary
swab_summary_location_traffic <- swabs_tidy |> 
  mutate(traffic = if_else(
    str_detect(location, 'Site 1'), 'high traffic', 'low traffic'
  )) |> 
  group_by(traffic) |> 
  positivity_summary()

## Figure 1: data aggregation --------------------------

# uottawa cases - biweekly counts 
# Half-week case counts overall and split by role. The right_join against
# the full biweekly calendar (truncated at 2022-02-03, when case reporting
# ended) inserts rows for bins with zero reported cases; the final across()
# converts the NAs introduced by that join back into 0 counts.
cases_biweekly <- 
  cases |> 
  select(case, biweek, role) |> 
  group_by(biweek) |> 
  summarise(cases = n(),
            cases_student = sum(role == 'Student'),
            cases_staff = sum(role == 'Employee'),
            ) |> 
  right_join(
    biweekly_date_sequence() |> 
      filter(biweek < '2022-02-03') |> 
      distinct(biweek),
    by = 'biweek'
  ) |> 
  mutate(across(where(is.integer), ~if_else(is.na(.), 0L, .))) 

# swabs biweekly summary
swabs_biweekly <- 
  swabs_tidy |>
  group_by(biweek) |> 
  positivity_summary() |> 
  left_join(swabs_tidy |> group_by(biweek) |> co2_summary())

# swabs biweekly summary by site  
swabs_biweekly_by_site <- 
  swabs_tidy |>
  group_by(site, biweek) |> 
  positivity_summary() |> 
  left_join(swabs_tidy |> group_by(site, biweek) |> co2_summary())

# UOttawa ww biweekly means
uottawa_ww_biweekly <- 
  uottawa_ww |> 
  mutate(biweek = get_date_biweekly(sample_date)) |> 
  group_by(biweek) |> 
  summarise(signal = mean(signal),
            n = n())

# daily ww data for study period
regional_ww_daily <- 
  regional_ww |> 
  filter(
    sample_date >= min(swabs_tidy$date_swab) - 4,
    sample_date <= max(swabs_tidy$date_swab) + 4,
  ) |> 
  group_by(sample_date) |> 
  summarise(mean = mean(mean))

# regional ww biweekly means
regional_ww_biweekly <- 
  regional_ww_daily |> 
  mutate(biweek = get_date_biweekly(sample_date)) |> 
  group_by(biweek) |> 
  summarise(mean = mean(mean))

# wifi site agg
wifi_extended <- read_rds(here('data/uo_wifi.rds')) |> 
  filter(date >= min(swabs$date_swab) - 2, 
         date <= max(swabs$date_swab) + 2) |> 
  mutate(biweek = get_date_biweekly(date))

# overall wifi biweekly timeseries
wifi_biweekly <- 
  wifi_extended |> 
  group_by(biweek) |> 
  summarise(
    n = n(),
    min = min(clients, na.rm = T),
    mean = mean(clients, na.rm = T),
    max = max(clients, na.rm = T)
  )
# per building wifi biweekly timeseries
wifi_biweek_sites <- 
  wifi_extended |> 
  group_by(biweek, site) |> 
  summarise(
    n = n(),
    min = min(clients),
    mean = mean(clients),
    max = max(clients)
  )

# Map --------------------------------------------------

swab_coords <- tribble(
  ~site, ~name, ~lat, ~lng,
  'MRT', 'Morrisette Hall (MRT)', 45.423239101483546, -75.68428970961207,
  'MNT', 'Montpetit', 45.42254171025894, -75.68265871464943,
  '90U', '90 University', 45.42242555707402, -75.6850144991752,
  'DMS', 'Desmarais Bldg.', 45.42387679340988, -75.68727075448753,
  'TBT', 'Tabaret Hall', 45.42450940162542, -75.68631900184072,
  'LPR',  'Louis Pasteur Bldg.', 45.421375668614466, -75.68026389993058,
)

ww_coords <- tribble(
  ~site, ~name, ~lat, ~lng,
  'aa', 'Annex Residence', 45.42678129026445, -75.68034405960745,
  'fa', 'Faculty of Soc. Sci.', 45.421624722628515, -75.68390386012699,
  'ft', 'Friel Residence', 45.43043440080566, -75.68290766533741,
  'tbt', 'Tabaret Hall', 45.42450940162542, -75.68631900184072
  # 'na', 'North sampling Site', NA, NA,
  # 'st', 'South Sampling Site', NA, NA, # Nobody knows where this is
)

figure_sites_map <- 
  leaflet::leaflet(swab_coords, width = 600, height = 330) |> 
  leaflet::addProviderTiles(leaflet::providers$CartoDB.Positron) |> 
  leaflet::setView(lng = -75.6843, lat = 45.4235, zoom = 14) |> 
  leaflet::addCircleMarkers(~lng, ~lat, label =  ~name, popup = ~name, 
                            radius = 2, color = '#440099') |> 
  leaflet::addLabelOnlyMarkers(
    ~lng, ~lat, label =  ~site, 
    labelOptions = leaflet::labelOptions(
      noHide = T, direction = 'top', textOnly = T, 
    )) |> 
  leaflet::addCircleMarkers(~lng, ~lat, label =  ~site, 
                            radius = 2, color = '#440099') |> 
  leaflet::addCircleMarkers(
    data = ww_coords, ~lng, ~lat, label =  ~name, popup =  ~name, 
    radius = 4, color = '#f96999')


# Results ----------------------------------------------------

rs_data <- lst(
  swabs_collected = nrow(swabs_tidy),
  positivity_ovr = swab_summary$positivity,
  positivity_site_min = min(swab_summary_sites$positivity),
  positivity_site_max = max(swab_summary_sites$positivity),
)

rs_abstract <- rs_data |> 
  glue::glue_data(
    "Over the 32-week study period, we collected {swabs_collected} swabs at six university buildings, including two buildings where wastewater surveillance was currently ongoing. 
Overall, {positivity_ovr}% of swabs were PCR-positive for SARS-CoV-2, with individual buildings ranging between {positivity_site_min}% and {positivity_site_max}%.
*Comment on when spikes in cases, swabs, and waste-water occurred….* *Comment on prediction of cases using swabs, ww, or combined approach…*
"
  )

rs_results <- rs_data |> 
  glue::glue_data()
Expand for plotting details
# fig1 -----
fig1_timeline <- 
  event_dates |> 
  ggplot(aes(x = start, xend = end, y = event, yend = event)) +
  geom_segment(linewidth = 4, lineend = 'round',
               color = theme_color,
               alpha = 0.3) +
  geom_segment(linewidth = 1, lineend = 'butt',
             color = theme_color,
             alpha = 0.9) +
  geom_point(aes(x = end), color = theme_color, size = 1.6) +
  geom_point(color = theme_color, size = 1.6) +
  scale_x_study_dates() +
  labs(title = 'Timeline') +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_cases <-
  cases_biweekly |>
  # adjust bar widths manually
  mutate(biweek = if_else(wday(biweek) == 5, biweek + 0.75, biweek - 0.75)) |>
  select(biweek, cases_student, cases_staff) |>
  pivot_longer(-biweek) |>
  mutate(name = str_remove(name, 'cases_') |> str_to_title()) |>
  ggplot() +
  geom_vline(
    data = tibble(x = as_date('2022-02-03')),
    aes(xintercept = x),
    lty = 2,
    color = 'gray'
  ) +
  geom_text(
    data = tibble(
      x = as_date('2022-02-05'),
      y = 18,
      label = 'End of data'
    ),
    aes(x, y, label = label),
    size = 2,
    hjust = 0
  ) +
  geom_col(
    aes(biweek, value, fill = name),
    size = 0.15,
    width = 3.5,
    color = 'gray80',
    position = position_stack()
  ) +
  labs(fill = NULL, title = 'UOttawa COVID-19 Cases') +
  scale_fill_carto_d() +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid() +
  theme(
    legend.position = c(0.9, 0.4),
    legend.key.size = unit(2, 'mm'),
    legend.background = element_rect(color = 'grey80')
  )

fig1_swabs <- 
  swabs_biweekly |> 
  ggplot(aes(biweek, positivity)) +
  geom_smooth(
    linewidth = 0.5, 
    alpha = 0.2, 
    fill = theme_color, 
    span = 0.4,
    method = 'loess',
    formula = 'y ~ x', 
    na.rm = T) +
  geom_point(color = theme_color, alpha = 0.85, shape = 1, size = 1) +
  scale_x_study_dates() +
  labs(title = 'Swab Positivity (%)') +
  theme_no_x_labels() +
  theme_remove_grid()
  
fig1_co2 <-
  ggplot(swabs_biweekly, aes(biweek, co2_mean)) +
  geom_point(color = theme_color, shape = 1, alpha = 0.85, size = 1) +
  geom_smooth(fill = theme_color, alpha = 0.2, size = 0.5) +
  labs(x = NULL, y = NULL,
       title = 'CO~2~ (ppm)') +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid()
  
fig1_wifi <-
  wifi_biweekly |> 
  ggplot(aes(biweek, mean)) +
  geom_smooth(span = 0.5, 
              size = 0.5,
              alpha = 0.2, 
              fill = theme_color) +
  geom_point(color = theme_color,  alpha = 0.85, shape = 1, size = 1) +
  labs(title = "Wi-Fi Connections") +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_uo_ww <-
  ggplot(uottawa_ww_biweekly,
         aes(biweek, signal)) +
  geom_smooth(span = 0.4, alpha = 0.2, size = 0.5, fill = theme_color, na.rm = T) +
  geom_point(
    # aes(size = n),
    alpha = 0.85, shape = 1, size = 1,
    color = theme_color,
    na.rm = T,
    show.legend = F) +
  labs(
    title = 'University Waste-Water',
  ) +
  scale_x_study_dates() +
  scale_size(range = c(1, 3)) +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_ott_ww <- 
  ggplot(regional_ww_daily) + 
  geom_smooth(aes(sample_date, mean), 
              na.rm = T,
              method = 'loess', formula = 'y ~ x',
              span = 0.2, size = 0.5, alpha = 0.2,
              fill = theme_color) +
  geom_point(data = regional_ww_biweekly,
             aes(biweek, mean),
             na.rm = T,
             alpha = 0.85, shape = 1, size = 1,
             color = theme_color) +
  labs(x = 'Date',
       title = 'Regional Waste-Water',
       ) +
  scale_x_study_dates() +
  theme_remove_grid() +
  theme(axis.title.x = ggtext::element_markdown(lineheight = 1),
        axis.text.x = ggtext::element_markdown(size = 6.7)) 

figure_1 <- wrap_plots(
    fig1_timeline, fig1_cases, fig1_swabs, fig1_co2, 
    fig1_wifi, fig1_uo_ww, fig1_ott_ww
  ) +
  plot_layout(ncol = 1, nrow = 7, heights = c(1, rep(1.5, 6)))

rm(fig1_timeline, fig1_swabs, fig1_co2, fig1_wifi, fig1_uo_ww, fig1_ott_ww, fig1_cases)

Results

Abstract Results

cat(rs_abstract)

Over the 32-week study period, we collected 554 swabs at six university buildings, including two buildings where wastewater surveillance was currently ongoing. Overall, 13% of swabs were PCR-positive for SARS-CoV-2, with individual buildings ranging between 4.8% and 32.7%. Comment on when spikes in cases, swabs, and waste-water occurred…. Comment on prediction of cases using swabs, ww, or combined approach…


Written Results

cat(rs_results)

Summary Tables

# stack summaries
swab_summary |> 
  mutate(site = 'Overall') |> 
  bind_rows(swab_summary_sites) |> 
  relocate(site) |> 
  set_names(c('Site', 'N', 'PCR-Positive', '%')) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = 'Building Summary'
  )
Building Summary
Site N PCR-Positive %
Overall 554 72 13.0
UO_90U 98 32 32.7
UO_DMS 86 5 5.8
UO_MRT 90 13 14.4
UO_MNT 98 9 9.2
UO_TBT 84 4 4.8
UO_LPR 98 9 9.2

Figures

(figure_1) 

Figure 1: Time-series of (top to bottom) important events at the university, biweekly COVID-19 case counts, proportion of PCR-positive swabs, biweekly mean ambient CO2 (across collection sites), biweekly mean peak number of Wi-Fi connections (across 5 buildings), biweekly mean waste-water signal (across up to 6 collection sites; relative to PPMoV), biweekly mean regional waste-water detection (relative to PPMoV). Points show biweekly means. Trend lines show the LOESS fit.



# joined data for pairwise correlation 
biweekly_data <- swabs_biweekly |> 
  transmute(biweek, `Swab PCR` = positivity, `CO2` = co2_mean) |> 
  left_join(
    wifi_biweekly |> transmute(biweek, `Wi-Fi` = mean),
    by = 'biweek'
    ) |> 
  left_join(
    uottawa_ww_biweekly |> transmute(biweek, `University WW` = signal),
    by = 'biweek'
  ) |> 
  left_join(
    regional_ww_biweekly |> transmute(biweek, `Ottawa WW` = mean),
    by = 'biweek'
  ) |> 
  left_join(
    cases_biweekly |> transmute(biweek, Cases = cases),
    by = 'biweek'
  )

corr_spearman <- corrr::correlate(
  biweekly_data, method = 'spearman')

figure_2 <- corrr::autoplot(corr_spearman, triangular = "lower",
                             barheight = 10) +
  geom_text(aes(label = r |> round(2)), size = 3.5)
  
figure_2

rm(corr_spearman)

Figure 2: Spearman correlation between pairs of biweekly measures.



Figure 3. (like Fig1 but disaggregated)

## thematic elements and axis scales -----
# limits are based on the min/max of data to keep scales consistent across all plots of a given variable
# Fixed y-scales shared by every per-site panel of a given variable so the
# site panels are directly comparable.
scale_y_co2    <- function() scale_y_continuous(limits = c(400, 950))
scale_y_copies <- function() scale_y_log10(limits = c(1, 300))
scale_y_cases  <- function() {
  scale_y_continuous(breaks = seq(0, 5, 2), limits = c(0, 5))
}
scale_y_wifi   <- function() scale_y_continuous(limits = c(0, 1000))

# Two-level discrete colour scale for the High/Low traffic factor; breaks
# and drop = F keep both legend entries even when a level is absent.
# NOTE(review): scale_color_carto_d() is not namespaced and rcartocolor is
# not attached above — presumably provided via R/report_plots.R; confirm.
scale_color_traffic <- function(){
  scale_color_carto_d(
    palette = 2,
    breaks = c('High', 'Low'),
    labels = c('High', 'Low'),
    drop = F
  )
}
# Multipanel theme with open (left/bottom) axes: builds on
# theme_remove_grid() and swaps the panel border for bare axis lines.
theme_axes_lb <- function(){
  theme_remove_grid() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(colour = "gray30", linewidth = rel(0.5)),
    plot.margin = margin(2,2,2,2, 'mm')
  )
}

# Keep y-axis labelling only on the leftmost column of the site grid
# (the DMS and MRT panels); strip it from every other site's panel.
remove_y_axis_if_not_leftmost <- function(p, site){
  if (!site %in% c('DMS', 'MRT')) {
    p <- p + theme_no_y_labels()
  }
  p
}

## individual plots ====
# Per-site stacked bar chart of half-week case counts by role, with a
# dashed vertical line marking the end of case data collection
# (2022-02-02). The role factor is pinned to two levels so the fill
# palette and legend are identical across sites even when a role is absent.
plot_site_cases <- function(data, site){
  p <- data |>
    ggplot(aes(biweek, cases, 
               fill = factor(role, levels = c('Employee', 'Student')))
           ) +
    geom_col(linewidth = .1, width = 2.5, color = 'grey70') +
    geom_vline(data = tibble(xint = as_date('2022-02-02')),
               aes(xintercept = xint), lty = 2, alpha = 0.35) +
    scale_x_study_dates() +
    scale_y_cases() +
    scale_fill_carto_d(
      palette = 1,  
      breaks = c('Employee', 'Student'),
      labels = c('Employee', 'Student'),
      drop = FALSE,
      name = 'Role') +
    labs(fill = 'Cases', y = 'Cases', x = NULL, title = site) +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown()) 
  # y-axis labels survive only on leftmost panels of the final grid
  remove_y_axis_if_not_leftmost(p, site)
}

# swabs
# Per-site time-series of swab copy numbers (copies + 1 on a log scale),
# colored by high/low traffic location. The data are split at 2022-01-01
# and drawn as two separate path/point layers so no line is drawn across
# the winter collection gap.
# NOTE(review): a swab dated exactly 2022-01-01 would be dropped by the
# strict </> comparisons — confirm none exist.
plot_site_swabs <- function(data, site){
  data_early <- data |> filter(date_swab < '2022-01-01')
  data_late <- data |> filter(date_swab > '2022-01-01')
  
  p <- data_early |>
    ggplot(aes(date_swab, copies_plus_one, color = traffic)) +
    geom_path(alpha = 0.8, linewidth = 0.6) +
    geom_point(size = 0.6, alpha = 0.8) +
    geom_path(data = data_late, alpha = 0.8, linewidth = 0.6) +
    geom_point(data = data_late, size = 0.6, alpha = 0.8) +
    scale_x_study_dates() +
    scale_y_copies() +
    scale_color_traffic() +
    labs(x = NULL, y = 'Copies + 1', color = 'Traffic Level') +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown(),
          legend.text = element_markdown())
  remove_y_axis_if_not_leftmost(p, site)
}

# co2
# Per-site CO2 time-series during swab collection, colored by traffic
# level; same pre/post 2022-01-01 split as plot_site_swabs() to break the
# line over the collection gap. Legend suppressed here (it is collected
# from the swabs panel instead).
plot_site_co2 <- function(data, site){
  data_early <- data |> filter(date_swab < '2022-01-01')
  data_late <- data |> filter(date_swab > '2022-01-01')
  
  p <- data_early |> 
    ggplot(aes(date_swab, co2, color = traffic)) +
    geom_path(alpha = 0.8) +
    geom_point(alpha = 0.8, size = 0.6) +
    geom_path(data = data_late, alpha = 0.8) +
    geom_point(data = data_late, alpha = 0.8, size = 0.6) +
    scale_x_study_dates() +
    scale_y_co2() +
    scale_color_traffic() +
    labs(y = 'CO~2~', x = NULL) +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown(), legend.position = 'none')
  remove_y_axis_if_not_leftmost(p, site)
}

# wifi
# Per-site daily peak Wi-Fi connection counts. Returns an empty spacer
# panel when the site has no Wi-Fi data (keeps the grid layout aligned).
# This is the bottom row of each site's panel, so x-axis labels are kept.
plot_site_wifi <- function(data, site){
  if (is.null(data)) return(patchwork::plot_spacer())
  
  p <- data |> 
    ggplot(aes(date, clients)) +
    geom_path(alpha = 0.5) +
    scale_x_study_dates() +
    scale_y_wifi() +
    labs(y = 'Wifi', x = NULL) +
    theme_axes_lb() +
    theme(
      axis.title.y = element_markdown(),
      axis.text.x = element_markdown(size = 6))
  remove_y_axis_if_not_leftmost(p, site)
}

## Plot datasets ====
fig3_cases <- 
  cases |> 
  unnest(associated_sites) |> 
  pivot_longer(UC:TBT, names_to = 'site') |> 
  mutate(site = if_else(site == 'UC', '90U', site)) |> 
  filter(value, 
         case_date > min(swabs_tidy$date_swab - 5),
         case_date < max(swabs_tidy$date_swab + 5),
         ) |> 
  group_by(site, biweek, role) |> 
  summarise(cases = n(),
            dates = list(case_date),
            .groups = 'drop') |>
  group_nest(site, .key = "cases")

fig3_swabs <- 
  swabs_tidy |> 
  mutate(
    site = str_remove(site, 'UO_'),
    traffic = if_else(str_detect(location, 'Site 1'),
                      'High', 'Low'),
    copies = convert_cq_to_copies(pcr_ct),
    copies_plus_one = if_else(is.na(copies), 1, copies + 1),
    ) |> 
  select(site, traffic, date_swab, copies_plus_one) |>
  group_nest(site, .key = "swabs")

fig3_wifi <- 
  wifi |> 
  mutate(site = str_remove(site, 'UO_')) |> 
  select(date, site, clients) |>
  group_nest(site, .key = "wifi")

fig3_co2 <- 
   swabs_tidy |> 
  transmute(
    site = str_remove(site, 'UO_'),
    traffic = if_else(str_detect(location, 'Site 1'), 'High', 'Low'),
    date_swab,
    co2,
    ) |> 
  group_nest(site, .key = "co2") |> 
  print()
## # A tibble: 6 × 2
##   site                 co2
##   <chr> <list<tibble[,3]>>
## 1 90U             [98 × 3]
## 2 DMS             [86 × 3]
## 3 LPR             [98 × 3]
## 4 MNT             [98 × 3]
## 5 MRT             [90 × 3]
## 6 TBT             [84 × 3]
# combine nested df
fig3_df_nest <- fig3_cases |> 
  left_join(fig3_swabs, by = join_by(site)) |> 
  left_join(fig3_wifi, by = join_by(site)) |> 
  left_join(fig3_co2, by = join_by(site))

## map data to plots, combine into panel for each site ====
fig3_plts <- fig3_df_nest |>
  arrange(site == '90U') |> 
  transmute(
    site,
    plt_cases = map2(cases, site, plot_site_cases),
    plt_swabs = map2(swabs, site, plot_site_swabs),
    plt_co2 = map2(co2, site, plot_site_co2),
    plt_wifi = map2(wifi, site, plot_site_wifi),
  ) |> 
  mutate(panel = pmap(
    .l = list(site, plt_cases, plt_swabs, plt_co2, plt_wifi),
    .f = function(site,plt_cases, plt_swabs, plt_co2, plt_wifi) {
      p <- 
        patchwork::wrap_plots(
          plt_cases, plt_swabs, plt_co2, plt_wifi, ncol = 1
        ) 
      
      if (site %in% c('90U', 'MNT')) return(p)
      p + patchwork::plot_annotation(theme = theme(
        axis.title.y = element_blank(),
        axis.text.y = element_blank()
      ))
    }))

## Combine all 6 sites into 2*3 panel
(
  figure_3 <- 
    patchwork::wrap_plots(
      fig3_plts$panel, 
      tag_level = 'keep',
      guides = 'collect'
    )
)

Figure 3. Campus COVID-19 cases, swab results, CO2 concentrations (during swab collection), and daily peak Wi-Fi connections by building. Cases plots show counts of self-reported cases for students and employees separately; several cases are associated with multiple buildings. Case data collection was abandoned in early February 2022 (dashed line). Swabs and CO2 plots show results at two locations within each building, with one sample collected in a high-traffic area and the other in a low-traffic area. Swab PCR results are expressed as the number of SARS-CoV-2 RNA copies plus one, on a log-scale. Points represent the result for a single swab. Wi-Fi plots show the peak daily number of simultaneous connections per building. No Wi-Fi data was available for the 90U building.

## save figures ----
# NOTE(review): figure_1 fails when exported to PDF (cause not identified —
# possibly the markdown/IBM Plex Sans text elements); PNG is the workaround.
ggsave('fig/figure_1.png', figure_1, device = 'png', width = 7, height = 7) 
ggsave('fig/figure_2.pdf', figure_2, device = 'pdf', height = 4, width = 4)
ggsave('fig/figure_2.png', figure_2, device = 'png', height = 4, width = 4)
ggsave('fig/figure_3.pdf', figure_3, device = 'pdf', height = 8, width = 12)
ggsave('fig/figure_3.png', figure_3, device = 'png', height = 8, width = 12)
# ggsave()
# 

r figure_sites_map
{=html} <div id="htmlwidget-a848a84d7f9293835f74" style="width:600px;height:330px;" class="leaflet html-widget"></div> <script type="application/json" data-for="htmlwidget-a848a84d7f9293835f74">{"x":{"options":{"crs":{"crsClass":"L.CRS.EPSG3857","code":null,"proj4def":null,"projectedBounds":null,"options":{}}},"calls":[{"method":"addProviderTiles","args":["CartoDB.Positron",null,null,{"errorTileUrl":"","noWrap":false,"detectRetina":false}]},{"method":"addCircleMarkers","args":[[45.4232391014835,45.4225417102589,45.422425557074,45.4238767934099,45.4245094016254,45.4213756686145],[-75.6842897096121,-75.6826587146494,-75.6850144991752,-75.6872707544875,-75.6863190018407,-75.6802638999306],2,null,null,{"interactive":true,"className":"","stroke":true,"color":"#440099","weight":5,"opacity":0.5,"fill":true,"fillColor":"#440099","fillOpacity":0.2},null,null,["Morrisette Hall (MRT)","Montpetit","90 University","Desmarais Bldg.","Tabaret Hall","Louis Pasteur Bldg."],null,["Morrisette Hall (MRT)","Montpetit","90 University","Desmarais Bldg.","Tabaret Hall","Louis Pasteur 
Bldg."],{"interactive":false,"permanent":false,"direction":"auto","opacity":1,"offset":[0,0],"textsize":"10px","textOnly":false,"className":"","sticky":true},null]},{"method":"addMarkers","args":[[45.4232391014835,45.4225417102589,45.422425557074,45.4238767934099,45.4245094016254,45.4213756686145],[-75.6842897096121,-75.6826587146494,-75.6850144991752,-75.6872707544875,-75.6863190018407,-75.6802638999306],{"iconUrl":{"data":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP6zwAAAgcBApocMXEAAAAASUVORK5CYII=","index":0},"iconWidth":1,"iconHeight":1},null,null,{"interactive":true,"draggable":false,"keyboard":true,"title":"","alt":"","zIndexOffset":0,"opacity":1,"riseOnHover":false,"riseOffset":250},null,null,null,null,["MRT","MNT","90U","DMS","TBT","LPR"],{"interactive":false,"permanent":true,"direction":"top","opacity":1,"offset":[0,0],"textsize":"10px","textOnly":true,"className":"","sticky":true},null]},{"method":"addCircleMarkers","args":[[45.4232391014835,45.4225417102589,45.422425557074,45.4238767934099,45.4245094016254,45.4213756686145],[-75.6842897096121,-75.6826587146494,-75.6850144991752,-75.6872707544875,-75.6863190018407,-75.6802638999306],2,null,null,{"interactive":true,"className":"","stroke":true,"color":"#440099","weight":5,"opacity":0.5,"fill":true,"fillColor":"#440099","fillOpacity":0.2},null,null,null,null,["MRT","MNT","90U","DMS","TBT","LPR"],{"interactive":false,"permanent":false,"direction":"auto","opacity":1,"offset":[0,0],"textsize":"10px","textOnly":false,"className":"","sticky":true},null]},{"method":"addCircleMarkers","args":[[45.4267812902645,45.4216247226285,45.4304344008057,45.4245094016254],[-75.6803440596074,-75.683903860127,-75.6829076653374,-75.6863190018407],4,null,null,{"interactive":true,"className":"","stroke":true,"color":"#f96999","weight":5,"opacity":0.5,"fill":true,"fillColor":"#f96999","fillOpacity":0.2},null,null,["Annex Residence","Faculty of Soc. 
Sci.","Friel Residence","Tabaret Hall"],null,["Annex Residence","Faculty of Soc. Sci.","Friel Residence","Tabaret Hall"],{"interactive":false,"permanent":false,"direction":"auto","opacity":1,"offset":[0,0],"textsize":"10px","textOnly":false,"className":"","sticky":true},null]}],"setView":[[45.4235,-75.6843],14,[]],"limits":{"lat":[45.4213756686145,45.4304344008057],"lng":[-75.6872707544875,-75.6802638999306]}},"evals":[],"jsHooks":[]}</script>
Map: Where are these collection sites? Pink markers represent waste-water collection sites; purple markers represent buildings where surface testing was performed. Only one building, Tabaret Hall, had both surface and waste-water testing. The locations of two waste-water sampling sites could not be ascertained and are not displayed on the map.
It remains unclear which buildings the ‘South Sampling Site’ and ‘North Sampling Site’ catchment areas are related to (not mapped).


Models

  • Hypothesis testing. HA: swab-PCR positivity, CO2, wifi, and WW signals are predictive of number of cases. H0: no relationship.
  • Method: Backwards selection of significant predictors, ANOVA
  • Formula: cases ~ swab_pcr + co2 + wifi + {university_ww} + regional_ww + (all interactions...).
  • Issue: only 10 / 50 biweekly obs are complete cases
    • cases counted only until Feb 2022 (~40% miss).
    • university_ww from Dec 2021 (~40% miss).
    • wifi occasional missing data.
  • Question: do we want standardized effects or simple effects? (I standardized because we are mostly interested in comparing different predictors within a model, as opposed to across different data sets.)
  • Question: do we want to consider interactions? (I did and didn’t)


# n=50 biweekly obs; uni_ww missing first half of period...
df_mod <- biweekly_data |> 
  janitor::clean_names() |> 
  mutate(across(-c(cases, biweek), scale)) # rescale

# only 10 complete observations, oof
df_mod_complete_obs <- df_mod |> na.omit()

# missing data
naniar::vis_miss(df_mod) +
  labs(subtitle = 'Aggregated TS Missingness')



Case count distribution

Cases counts seem to follow a negative binomial distribution more than Poisson.

# simulated curves
# Overlay one simulated Poisson density on plot `p`: draws as many Poisson
# deviates as there are non-missing case counts, with lambda equal to the
# observed mean. Called repeatedly to build a simulation envelope.
# Uses `linewidth` (not the deprecated `size`) for line thickness,
# matching the rest of this file, and TRUE rather than the T alias.
add_poisson_density_curve <- function(p) {
  p + geom_density(
    data = tibble(
      x = rpois(n = sum(!is.na(df_mod$cases)),
                lambda = mean(df_mod$cases, na.rm = TRUE))
      ), 
    aes(x), color = 'blue', alpha = 0.25, linewidth = 0.03
  )
}
# Overlay one simulated negative-binomial density on plot `p`
# (dispersion size = 1, mu = observed mean of the case counts), mirroring
# add_poisson_density_curve(). Trailing comma inside rnbinom(...) removed;
# `linewidth` replaces the deprecated `size` for line thickness.
add_nb_density_curve <- function(p) {
  p +
    geom_density(
      data = tibble(x = rnbinom(
        size = 1,
        n = sum(!is.na(df_mod$cases)),
        mu = mean(df_mod$cases, na.rm = TRUE)
      )),
      aes(x), color = 'blue', alpha = 0.25, linewidth = 0.03
    )
}
# Layer `plt_fn` onto a fresh plot of df_mod 100 times (one simulated
# density curve per call), then draw the observed case-count density on
# top. purrr::reduce replaces the original walk() + `<<-` assignment;
# the function is still called 100 times in the same order, so RNG
# consumption — and therefore output — is unchanged.
simulation_plot <- function(plt_fn, title){
  base <- df_mod |> ggplot()
  p <- reduce(seq(100), \(acc, i) plt_fn(acc), .init = base)
  p + geom_density(aes(cases)) + labs(x = 'Cases', subtitle = title)
}

# Empirical case distribution vs. 100 simulated Poisson curves
poisson_panel <- simulation_plot(
  add_poisson_density_curve,
  'Cases vs. Simulated poisson distribution'
)

# Empirical case distribution vs. 100 simulated negative-binomial curves
nb_panel <- simulation_plot(
  add_nb_density_curve,
  'Cases vs. Simulated negative binomial distribution'
)

# stack the two panels vertically (patchwork)
poisson_panel / nb_panel

# tidy up: drop the panels and one-off helper functions
rm(poisson_panel, nb_panel, add_poisson_density_curve, add_nb_density_curve)



Poisson Regression, Backwards Selection


Poisson Model with all interactions

Backwards selection arrived at the model with swab_pcr, wifi, ottawa_ww, and the interaction between ottawa_ww & swab_pcr.

# 30 complete obs if excluding university ww entirely
df_mod_no_uniww <- df_mod |> select(-university_ww) |> na.omit()

# Full Poisson model: all main effects plus all two-way interactions
# (backwards select with n=30, no university ww)
full_model <- glm(
  cases ~ . ^ 2,
  data = df_mod_no_uniww |> select(-biweek), 
  family = 'poisson')

# do backwards selection (AIC-based)
backwards_selected <- step(full_model, trace = FALSE)
stopifnot(backwards_selected$converged)

# standardized coefs and 95% CI
broom::tidy(backwards_selected, conf.int = TRUE, conf.level = 0.95) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Poisson model with interactions: Coefficients & 95% CI"
    )
Poisson model with interactions: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 1.1770556 0.2318347 5.077133 0.0000004 0.6901184 1.6032567
swab_pcr 0.9139300 0.2028453 4.505551 0.0000066 0.5190708 1.3208155
wi_fi 0.9444259 0.3402044 2.776054 0.0055023 0.2809884 1.6216141
ottawa_ww 2.2294187 0.5457966 4.084706 0.0000441 1.1978919 3.3464615
swab_pcr:ottawa_ww -0.5504876 0.1853158 -2.970538 0.0029728 -0.9276135 -0.1978697
# the swab_pcr:ottawa_ww interaction is significant, so report
# type III sums-of-squares likelihood-ratio tests
car::Anova(backwards_selected, type = 3) |>
  rownames_to_column(var = 'Term') |>
  as_tibble() |>
  kableExtra::kable(format = 'pipe', caption = "Type III ANOVA")
Type III ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 19.958024 1 0.0000079
wi_fi 7.782973 1 0.0052741
ottawa_ww 19.297456 1 0.0000112
swab_pcr:ottawa_ww 9.616743 1 0.0019281
# test model assumption of equidispersion
# p > 0.05 means no significant overdispersion
# (AER::dispersiontest: H0 dispersion = 1 vs H1 dispersion > 1)
AER::dispersiontest(backwards_selected)
## 
##  Overdispersion test
## 
## data:  backwards_selected
## z = 0.51985, p-value = 0.3016
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.120566
Backward selection model summary and diagnostics
# coefficient table and deviance summary for the interaction model
summary(backwards_selected)
## 
## Call:
## glm(formula = cases ~ swab_pcr + wi_fi + ottawa_ww + swab_pcr:ottawa_ww, 
##     family = "poisson", data = select(df_mod_no_uniww, -biweek))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8448  -0.8735  -0.6446   0.3903   2.1520  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.1771     0.2318   5.077 3.83e-07 ***
## swab_pcr             0.9139     0.2028   4.506 6.62e-06 ***
## wi_fi                0.9444     0.3402   2.776  0.00550 ** 
## ottawa_ww            2.2294     0.5458   4.085 4.41e-05 ***
## swab_pcr:ottawa_ww  -0.5505     0.1853  -2.971  0.00297 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 182.217  on 29  degrees of freedom
## Residual deviance:  30.924  on 25  degrees of freedom
## AIC: 90.491
## 
## Number of Fisher Scoring iterations: 5
# base-R glm diagnostic plots (residuals, QQ, scale-location, leverage)
plot(backwards_selected)

# distribution of working residuals (glm $residuals) as a rough shape check
hist(backwards_selected$residuals)

# keep only the selected model
rm(full_model)



Poisson Main Effects Only

Poisson model chosen by backwards selection with no interactions identifies swab positivity and Ottawa WW signal as significant predictors.

# Main-effects-only Poisson model
# (backwards select with n=30, no university ww)
me_model <- glm(
  cases ~ swab_pcr + co2 + wi_fi + ottawa_ww,
  data = df_mod_no_uniww, 
  family = 'poisson')

# AIC-based backwards selection
backwards_selected_me <- step(me_model, trace = FALSE)
stopifnot(backwards_selected_me$converged)

# nearly significant overdispersion when only main effects included
# (the interaction model above showed no overdispersion)
AER::dispersiontest(backwards_selected_me)
## 
##  Overdispersion test
## 
## data:  backwards_selected_me
## z = 1.6019, p-value = 0.05459
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.346469
# regression coefs and 95% CI for the selected main-effects model
broom::tidy(backwards_selected_me, conf.int = TRUE, conf.level = 0.95) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Main Effects Only: Coefficients & 95% CI"
    )
Main Effects Only: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.5610319 0.1637298 3.426571 0.0006113 0.2208496 0.8646110
swab_pcr 0.4707155 0.1470632 3.200770 0.0013706 0.1802482 0.7566573
ottawa_ww 0.8006168 0.2506409 3.194279 0.0014018 0.3175856 1.2995081
# no significant interactions retained, so type II SS tests apply
car::Anova(backwards_selected_me, type = 2) |>
  rownames_to_column(var = 'Term') |>
  as_tibble() |>
  kableExtra::kable(format = 'pipe', caption = "Type II ANOVA")
Type II ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 10.00096 1 0.0015646
ottawa_ww 10.80594 1 0.0010118
Backward selection model summary and diagnostics
# coefficient table and deviance summary for the main-effects model
summary(backwards_selected_me)
## 
## Call:
## glm(formula = cases ~ swab_pcr + ottawa_ww, family = "poisson", 
##     data = df_mod_no_uniww)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6786  -1.2634  -0.8722   0.5679   2.1224  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5610     0.1637   3.427 0.000611 ***
## swab_pcr      0.4707     0.1471   3.201 0.001371 ** 
## ottawa_ww     0.8006     0.2506   3.194 0.001402 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 182.22  on 29  degrees of freedom
## Residual deviance:  41.42  on 27  degrees of freedom
## AIC: 96.987
## 
## Number of Fisher Scoring iterations: 5
# base-R glm diagnostic plots for the main-effects model
plot(backwards_selected_me)

# working residuals distribution
hist(backwards_selected_me$residuals)

# keep only the selected model
rm(me_model)



NB model, backwards selection

NB model with main effects also identifies cases ~ swab + ottawa_ww as optimal model (as with Poisson model). NB fit didn’t converge when all interactions are included in the model.


Main effects, no interactions

# negative binomial regression; main effects only
full_model_nb <- MASS::glm.nb(
  cases ~ ., 
  data = df_mod_no_uniww |> select(-biweek),
  control = glm.control(maxit = 5000)
  )

# AIC-based backwards selection
backwards_selected_nb <- step(full_model_nb, trace = FALSE)
stopifnot(backwards_selected_nb$converged)

# standardized coefs and 95% CI
# (broom::tidy already returns a tibble with a `term` column, so the
# former rownames_to_column call only added a meaningless index column)
broom::tidy(backwards_selected_nb, conf.int = TRUE, conf.level = 0.95) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "NB model -no interactions: Coefficients & 95% CI"
    )
NB model -no interactions: Coefficients & 95% CI
Term term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 0.5513504 0.1708256 3.227564 0.0012485 0.1989048 0.8680663
2 swab_pcr 0.4839454 0.1620504 2.986389 0.0028229 0.1638807 0.7989784
3 ottawa_ww 0.8058500 0.2713415 2.969874 0.0029792 0.2810875 1.3443150
Model summary and diagnostics
# not the worst fit, could be much better
# note the huge theta (~31) with an enormous SE (~91): the NB fit is
# effectively collapsing toward a Poisson model
summary(backwards_selected_nb)
## 
## Call:
## MASS::glm.nb(formula = cases ~ swab_pcr + ottawa_ww, data = select(df_mod_no_uniww, 
##     -biweek), control = glm.control(maxit = 5000), init.theta = 31.03709038, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4918  -1.2413  -0.8631   0.5150   2.0839  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.5514     0.1708   3.228  0.00125 **
## swab_pcr      0.4839     0.1621   2.986  0.00282 **
## ottawa_ww     0.8059     0.2713   2.970  0.00298 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(31.0371) family taken to be 1)
## 
##     Null deviance: 163.247  on 29  degrees of freedom
## Residual deviance:  38.936  on 27  degrees of freedom
## AIC: 98.879
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  31.0 
##           Std. Err.:  91.3 
## 
##  2 x log-likelihood:  -90.879
# diagnostic plots for the NB main-effects model
plot(backwards_selected_nb)

# working residuals distribution
hist(backwards_selected_nb$residuals)

# keep only the selected model
rm(full_model_nb)

The negative binomial model (with extra parameter \(\theta\)) does not significantly improve the log likelihood compared to the poisson model (p > 0.05).

library(lmtest)
# poisson vs. nb models with main effects only
# (NB nests Poisson as theta -> Inf; the test adds 1 df for theta)
lmtest::lrtest(backwards_selected_me, backwards_selected_nb)
## Likelihood ratio test
## 
## Model 1: cases ~ swab_pcr + ottawa_ww
## Model 2: cases ~ swab_pcr + ottawa_ww
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   3 -45.494                     
## 2   4 -45.440  1 0.1083     0.7421
NB fit fails with interactions
# NB model with all two-way interactions; numerically fragile
# (theta diverges), so the selection step is wrapped in try()
full_model_nb_int <- MASS::glm.nb(
  cases ~ . ^ 2, 
  data = df_mod_no_uniww |> select(-biweek) |> na.omit(),
  control = glm.control(maxit = 15000)
  )
# do selection; step() may error on this fit, hence try()
try((backwards_selected_nb_int <- step(full_model_nb_int)))
## Start:  AIC=-234
## cases ~ (swab_pcr + co2 + wi_fi + ottawa_ww) * (swab_pcr + co2 + 
##     wi_fi + ottawa_ww)
## 
##                      Df Deviance     AIC
## - swab_pcr:wi_fi      1   11.247 -252.49
## - swab_pcr:co2        1   20.301 -243.44
## - co2:ottawa_ww       1   20.377 -243.36
## - wi_fi:ottawa_ww     1   21.188 -242.55
## - co2:wi_fi           1   27.794 -235.94
## <none>                    27.737 -234.00
## - swab_pcr:ottawa_ww  1   39.531 -224.21
## 
## Step:  AIC=98.26
## cases ~ swab_pcr + co2 + wi_fi + ottawa_ww + swab_pcr:co2 + swab_pcr:ottawa_ww + 
##     co2:wi_fi + co2:ottawa_ww + wi_fi:ottawa_ww
## 
## Call:  MASS::glm.nb(formula = cases ~ swab_pcr + co2 + wi_fi + ottawa_ww + 
##     swab_pcr:co2 + swab_pcr:ottawa_ww + co2:wi_fi + co2:ottawa_ww + 
##     wi_fi:ottawa_ww, data = na.omit(select(df_mod_no_uniww, -biweek)), 
##     control = glm.control(maxit = 15000), init.theta = 261853298.3, 
##     link = log)
## 
## Coefficients:
##        (Intercept)            swab_pcr                 co2               wi_fi  
##            2.16837             1.28306             0.10333             1.64716  
##          ottawa_ww        swab_pcr:co2  swab_pcr:ottawa_ww           co2:wi_fi  
##            3.81150             0.43175            -0.43881             0.08261  
##      co2:ottawa_ww     wi_fi:ottawa_ww  
##           -0.56701             1.09738  
## 
## Degrees of Freedom: 29 Total (i.e. Null);  20 Residual
## Null Deviance:       182.2 
## Residual Deviance: 28.69     AIC: 100.3
# discard the failed interaction model
rm(full_model_nb_int)



Model with university WW

If we consider the ten biweekly observations where we have linked cases, swabs, and university WW data,

# backwards select with university ww (n=10 biweekly obs)
# main effects only -- too few observations to support interactions
full_model_w_uni_ww <- glm(
  cases ~ .,
  data = df_mod_complete_obs |> select(-biweek), 
  family = 'poisson')

# do backwards selection (AIC-based)
backwards_selected_ww <- step(full_model_w_uni_ww, trace = FALSE)
stopifnot(backwards_selected_ww$converged)


# standardized coefs and 95% CI
# (caption corrected: this model has main effects only, no interactions)
broom::tidy(backwards_selected_ww, conf.int = TRUE, conf.level = 0.95) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Poisson model with university WW: Coefficients & 95% CI"
    )
Poisson model with interactions: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.7045672 0.3065951 2.298038 0.0215596 0.0382917 1.2493210
swab_pcr 0.3073919 0.1589153 1.934313 0.0530747 -0.0055133 0.6198578
university_ww 0.1389517 0.0823339 1.687660 0.0914765 -0.0317802 0.2962807
ottawa_ww 0.9383949 0.3147511 2.981387 0.0028695 0.3615362 1.6072932
# main-effects model, so type II SS tests are appropriate
car::Anova(backwards_selected_ww, type = 2) |>
  rownames_to_column(var = 'Term') |>
  as_tibble() |>
  kableExtra::kable(format = 'pipe', caption = "Type II ANOVA")
Type II ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 3.708366 1 0.0541404
university_ww 2.606515 1 0.1064254
ottawa_ww 10.992750 1 0.0009147
# test model assumption of equidispersion
# p > 0.05 means no significant overdispersion
# (dispersion estimate here is < 1, i.e. if anything underdispersed)
AER::dispersiontest(backwards_selected_ww)
## 
##  Overdispersion test
## 
## data:  backwards_selected_ww
## z = -1.0769, p-value = 0.8592
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.6770926
Model summary and diagnostics
# coefficient table and deviance summary for the n=10 university-WW model
summary(backwards_selected_ww)
## 
## Call:
## glm(formula = cases ~ swab_pcr + university_ww + ottawa_ww, family = "poisson", 
##     data = select(df_mod_complete_obs, -biweek))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.54583  -0.95196   0.04579   0.50891   1.11356  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    0.70457    0.30660   2.298  0.02156 * 
## swab_pcr       0.30739    0.15892   1.934  0.05307 . 
## university_ww  0.13895    0.08233   1.688  0.09148 . 
## ottawa_ww      0.93839    0.31475   2.981  0.00287 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 61.8835  on 9  degrees of freedom
## Residual deviance:  7.8612  on 6  degrees of freedom
## AIC: 46.283
## 
## Number of Fisher Scoring iterations: 5
# diagnostic plots for the university-WW model
plot(backwards_selected_ww)

# working residuals distribution
hist(backwards_selected_ww$residuals)

# keep only the selected model
rm(full_model_w_uni_ww)

GLMM: High v Low Traffic

# positivity by traffic level
# NOTE(review): set_names assumes the upstream summary's columns are in
# the order traffic-level / N / positives / percent -- confirm upstream
swab_summary_location_traffic |> 
  set_names(c('Traffic level', 'N', 'PCR-Positive', '%')) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = 'Positivity by sample location traffic-level'
  )
Positivity by sample location traffic-level
Traffic level N PCR-Positive %
high traffic 280 40 14.3
low traffic 274 32 11.7

To evaluate whether high traffic locations are different from low traffic sites, in terms of swab-PCR positivity, we fit a mixed model with random intercepts for different sites. There appears to be little difference (14.3% vs 11.7% overall) and this is confirmed by our mixed model with traffic level as a fixed effect (OR 1.28; 95% CI 0.77-2.13).

# Label each swab by location traffic level ('Site 1' = high traffic).
# NOTE(review): str_detect(location, 'Site 1') is a substring match and
# would also hit 'Site 10', 'Site 11', ... -- confirm site naming.
traffic_swabs <- 
  swabs_tidy |> 
  mutate(
    traffic = if_else(
      str_detect(location, 'Site 1'), 'high', 'low') |> 
      factor(levels = c('low', 'high'))
    ) |> 
  select(site, traffic, pcr_positive)

# logistic GLMM: traffic level as fixed effect, random intercept per site
mixed_model <- 
  lme4::glmer(pcr_positive ~ traffic + (1 | site), data = traffic_swabs,
            family = 'binomial')

# exponentiated fixed effects = odds ratios with 95% CI
broom.mixed::tidy(mixed_model, conf.int = TRUE, exponentiate = TRUE) |> 
  kableExtra::kable(format = 'pipe',
                    caption = 'Traffic level - Random intercepts model')
Traffic level - Random intercepts model
effect group term estimate std.error statistic p.value conf.low conf.high
fixed NA (Intercept) 0.1059831 0.0376951 -6.3105362 0.0000000 0.0527824 0.2128063
fixed NA traffichigh 1.2799870 0.3340917 0.9457426 0.3442799 0.7674179 2.1349081
ran_pars site sd__(Intercept) 0.7081809 NA NA NA NA NA

Case Data Imputation…

Multiple imputation (mice) has a key assumption that data are missing at random (MAR). Our data are arguably not MAR: the missingness appears to be driven by dynamics that are not captured in the observed variables (i.e., MNAR).

Using mice means making the analysis much more complicated… Generally, you need to model each imputed dataset and then analyze the pooled results. That gets a lot more complicated when needing to first aggregate the data, join to a bunch of other data, do various modelling tasks…. all nested within each iteration.

Instead, I used a simpler ad-hoc method:

  • if positive-test-result date is available, use that, else:
  • use -5 days from end-of-isolation date, or if not available,
  • use the test date as is, or if not available,
  • use +3 days from symptoms-began date

It’s not really the most sound approach, but the data is questionable to begin with…